{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "songs = pd.read_csv('data/drake-songs.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "songs = songs[songs['album'] == 'thank-me-later']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "37375"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = ''\n",
    "\n",
    "for index, row in songs['lyrics'].iteritems():\n",
    "    text = text + str(row).lower().replace('|-|', '\\n')\n",
    "    \n",
    "len(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "corpus_raw = text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "words = re.findall(r\"\\b[A-z']+\\b\", corpus_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7482"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "int2word = dict()\n",
    "word2int = dict()\n",
    "\n",
    "uniq_words = set(words)\n",
    "vocab_len = len(uniq_words)\n",
    "\n",
    "for i, word in enumerate(uniq_words):\n",
    "    int2word[i] = word\n",
    "    word2int[word] = i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = []\n",
    "\n",
    "for sentence in corpus_raw.split('\\n'):\n",
    "    sentences.append(re.findall(r\"\\b[A-z']+\\b\", sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "947"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "data =[]\n",
    "window = 2\n",
    "\n",
    "for sentence in sentences:\n",
    "    for i, center_word in enumerate(sentence):\n",
    "        for word in sentence[max(0, i - window) : min(i + window, len(sentence))]:\n",
    "            if center_word != word:\n",
    "                data.append([center_word, word])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18251"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_oh_vector(data_point_index, vocab_size):\n",
    "    temp = np.zeros(vocab_size)\n",
    "    temp[data_point_index] = 1\n",
    "    return temp\n",
    "\n",
    "x_train = []\n",
    "y_train = []\n",
    "\n",
    "for data_word in data:\n",
    "    x_train.append(get_oh_vector(word2int[data_word[0]], vocab_len))\n",
    "    y_train.append(get_oh_vector(word2int[data_word[1]], vocab_len))\n",
    "\n",
    "x_train = np.asarray(x_train)\n",
    "y_train = np.asarray(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unidecode import unidecode\n",
    "from keras.layers import Dense\n",
    "from keras.models import Sequential\n",
    "from keras.optimizers import Adam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoding_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18251, 18251)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Sequential()\n",
    "model.add(Dense(encoding_size, input_shape=(x_train.shape[0], ), activation='relu'))\n",
    "model.add(Dense(y_train.shape[0], activation='softmax'))\n",
    "model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "dense_31 (Dense)             (None, 3)                 54756     \n",
      "_________________________________________________________________\n",
      "dense_32 (Dense)             (None, 18251)             73004     \n",
      "=================================================================\n",
      "Total params: 127,760\n",
      "Trainable params: 127,760\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/10\n",
      " 4064/18251 [=====>........................] - ETA: 28s - loss: 9.7201 - acc: 0.0438"
     ]
    }
   ],
   "source": [
    "model.fit(x_train, y_train, epochs=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
